NBA Player Archetype Clustering with Neural
Networks
#Preprocessing can take 15-30 min to run depending on the PC
library(torch)
library(dplyr)
library(ggplot2)
library(stats) # for kmeans
library(ggrepel)
library(patchwork)
library(tidyr)
library(purrr)
library(uwot)
library(plotly)
# Set global seed for reproducibility
set.seed(67)
# Set seed for torch (PyTorch)
torch_manual_seed(67)
# Load data
player_stats <- read.csv("~/Desktop/Stat 240/data/PlayerStatistics.csv")
# Define feature columns
feature_cols <- c(
"points", "assists", "reboundsOffensive", "reboundsDefensive",
"steals", "blocks", "turnovers", "fieldGoalsAttempted",
"fieldGoalsMade", "fieldGoalsPercentage", "threePointersAttempted",
"threePointersMade", "threePointersPercentage", "freeThrowsAttempted",
"freeThrowsMade", "freeThrowsPercentage", "numMinutes"
)
# Filter and extract features
player_stats_filtered <- player_stats %>%
filter(complete.cases(select(., all_of(feature_cols))))
features <- player_stats_filtered %>% select(all_of(feature_cols))
# Normalize features
features_scaled <- scale(features)
features_mat <- as.matrix(features_scaled)
# Convert to torch tensor
x <- torch_tensor(features_mat, dtype = torch_float())
# Define autoencoder
autoencoder <- nn_module(
initialize = function(input_dim, embedding_dim) {
self$encoder <- nn_sequential(
nn_linear(input_dim, 64),
nn_relu(),
nn_linear(64, embedding_dim)
)
self$decoder <- nn_sequential(
nn_linear(embedding_dim, 64),
nn_relu(),
nn_linear(64, input_dim)
)
},
forward = function(x) {
z <- self$encoder(x)
x_recon <- self$decoder(z)
list(embedding = z, reconstruction = x_recon)
}
)
input_dim <- ncol(features_mat)
embedding_dim <- 8
# Instantiate and train model
model <- autoencoder(input_dim = input_dim, embedding_dim = embedding_dim)
optimizer <- optim_adam(model$parameters, lr = 0.001)
loss_fn <- nn_mse_loss()
num_epochs <- 10
batch_size <- 128
num_batches <- ceiling(nrow(features_mat) / batch_size)
for (epoch in 1:num_epochs) {
model$train()
total_loss <- 0
for (batch in 1:num_batches) {
start_idx <- (batch - 1) * batch_size + 1
end_idx <- min(batch * batch_size, nrow(features_mat))
batch_x <- x[start_idx:end_idx, ]
optimizer$zero_grad()
output <- model(batch_x)
loss <- loss_fn(output$reconstruction, batch_x)
loss$backward()
optimizer$step()
total_loss <- total_loss + loss$item()
}
}
# Get embeddings
model$eval()
with_no_grad({
embeddings <- model$encoder(x)
})
# Convert embeddings to matrix
embeddings_mat <- as.array(embeddings)
colnames(embeddings_mat) <- paste0("V", 1:ncol(embeddings_mat))
# Cluster embeddings
set.seed(67)
k <- 7
clusters <- kmeans(embeddings_mat, centers = k)
# Attach cluster labels
player_clusters <- data.frame(embeddings_mat)
player_clusters$cluster <- as.factor(clusters$cluster)
# Perform PCA
pca_result <- prcomp(embeddings_mat, center = TRUE, scale. = TRUE)
pca_scores <- as.data.frame(pca_result$x[, 1:2])
colnames(pca_scores) <- c("PC1", "PC2")
pca_scores$cluster <- clusters$cluster
# Combine player names and clusters
player_clusters_full <- player_stats_filtered %>%
select(firstName, lastName, personId) %>%
bind_cols(
as.data.frame(embeddings_mat),
cluster = clusters$cluster
) %>%
mutate(fullName = paste(firstName, lastName))
# --- Compute 3D UMAP embeddings ---
set.seed(67)
umap_3d <- umap(embeddings_mat, n_neighbors = 15, n_components = 3, metric = "euclidean")
umap_df <- as.data.frame(umap_3d)
colnames(umap_df) <- c("UMAP1", "UMAP2", "UMAP3")
umap_df$cluster <- as.factor(clusters$cluster)
umap_df$player <- player_clusters_full$fullName
# Function to find top representative players
find_top_players <- function(cluster_num, n = 5) {
cluster_data <- embeddings_mat[clusters$cluster == cluster_num, ]
center <- clusters$centers[cluster_num, ]
distances <- apply(cluster_data, 1, function(x) sqrt(sum((x - center)^2)))
top_indices <- order(distances)[1:n]
cluster_players <- player_clusters_full %>% filter(cluster == cluster_num)
top_players <- cluster_players[top_indices, ] %>% select(fullName, personId, cluster)
data.frame(Player = top_players$fullName, ID = top_players$personId, Distance = distances[top_indices])
}
# Function to find most distinctive players
find_extreme_players <- function(cluster_num, n = 5) {
cluster_data <- embeddings_mat[clusters$cluster == cluster_num, ]
other_centers <- clusters$centers[-cluster_num, ]
distances <- apply(cluster_data, 1, function(x) mean(apply(other_centers, 1, function(y) sqrt(sum((x - y)^2)))))
extreme_indices <- order(distances, decreasing = TRUE)[1:n]
cluster_players <- player_clusters_full %>% filter(cluster == cluster_num)
extreme_players <- cluster_players[extreme_indices, ] %>% select(fullName, personId, cluster)
data.frame(Player = extreme_players$fullName, ID = extreme_players$personId, Distinctiveness = distances[extreme_indices])
}
# Get top and distinctive players
top_players_list <- map(1:k, function(i) find_top_players(i))
extreme_players_list <- map(1:k, function(i) find_extreme_players(i))
library(ggplot2)
library(plotly)
library(dplyr)
library(patchwork) # for plot_layout and plot_annotation
cluster_labels <- c(
"1" = "Versatile Role Player",
"2" = "Sharp Shooters",
"3" = "Star Scorers",
"4" = "Dominant Bigs",
"5" = "Bench Players",
"6" = "Supporting Guards",
"7" = "Energy Bigs"
)
manual_colors <- c(
"Versatile Role Player" = "#E41A1C",
"Sharp Shooters" = "#377EB8",
"Star Scorers" = "#4DAF4A",
"Dominant Bigs" = "#984EA3",
"Bench Players" = "#FF7F00",
"Supporting Guards" = "#FFFF33",
"Energy Bigs" = "#A65628"
)
# Convert numeric cluster to character first to use as keys for cluster_labels
pca_scores$cluster_label <- factor(
cluster_labels[as.character(pca_scores$cluster)],
levels = names(manual_colors)
)
umap_df$cluster <- as.character(umap_df$cluster) # ensure character for indexing
umap_df$cluster_label <- factor(
cluster_labels[umap_df$cluster],
levels = names(manual_colors)
)
explained_var <- pca_result$sdev^2
explained_var_percent <- explained_var / sum(explained_var) * 100
explained_var_percent[1:5]
## [1] 41.194260 26.759893 16.171984 7.338242 4.486408
x_lab <- paste0("PC1 (", round(explained_var_percent[1], 1), "% variance)")
y_lab <- paste0("PC2 (", round(explained_var_percent[2], 1), "% variance)")
# Your cluster labels named by cluster number
top_players_plot <- bind_rows(top_players_list, .id = "cluster") %>%
group_by(cluster) %>%
slice_head(n = 3) %>%
ungroup() %>%
mutate(cluster_label = factor(cluster_labels[cluster], levels = names(manual_colors))) %>%
ggplot(aes(reorder(Player, -Distance), Distance, fill = cluster_label)) +
geom_col() +
facet_wrap(~cluster_label, scales = "free_x", nrow = 1) +
scale_fill_manual(values = manual_colors) +
labs(title = "Most Representative Players per Cluster", x = "", y = "Distance to Center") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 60, hjust = 1, size = 8),
axis.text.y = element_text(size = 6),
strip.text = element_blank(), # THIS removes facet strip labels
legend.position = "none"
)
distinctive_players_plot <- bind_rows(extreme_players_list, .id = "cluster") %>%
group_by(cluster) %>%
slice_head(n = 3) %>%
ungroup() %>%
mutate(cluster_label = factor(cluster_labels[cluster], levels = names(manual_colors))) %>%
ggplot(aes(reorder(Player, -Distinctiveness), Distinctiveness, fill = cluster_label)) +
geom_col() +
facet_wrap(~cluster_label, scales = "free_x", nrow = 1) +
scale_fill_manual(values = manual_colors) +
labs(title = "Most Distinctive Players per Cluster", x = "", y = "Distance from other Clusters") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 60, hjust = 1, size = 8),
axis.text.y = element_text(size = 6),
strip.text = element_blank(), # THIS removes facet strip labels
legend.position = "none"
)
# PCA plot with manual colors and factor cluster_label
pca_plot <- ggplot(pca_scores, aes(PC1, PC2, color = cluster_label)) +
geom_point(alpha = 0.4, size = 1.5) +
scale_color_manual(values = manual_colors, name = "Cluster Type") +
labs(
title = "Player Archetypes in PCA Space",
subtitle = "Each cluster represents distinct play styles",
x = x_lab,
y = y_lab
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "right",
legend.text = element_text(size = 8),
legend.key.size = unit(0.5, "cm")
)
print(pca_plot)

# Print the two bar plots stacked vertically
print(top_players_plot / distinctive_players_plot)

# Convert cluster column to character for mapping to labels
umap_df$cluster <- as.character(umap_df$cluster)
# Map cluster IDs to descriptive labels, keeping the order consistent
umap_df$cluster_label <- factor(
cluster_labels[umap_df$cluster],
levels = names(manual_colors)
)
# Randomly sample 200,000 rows to reduce plot size and improve performance
set.seed(67)
sampled_indices <- sample(nrow(umap_df), 200000)
umap_sampled <- umap_df[sampled_indices, ]
umap_3d_plot <- plot_ly(
data = umap_sampled,
x = ~UMAP1,
y = ~UMAP2,
z = ~UMAP3,
color = ~cluster_label,
colors = manual_colors,
text = ~player,
type = "scatter3d",
mode = "markers",
marker = list(size = 4, opacity = 0.7)
) %>%
layout(
title = list(
text = paste0(
"3D UMAP Visualization of Player Embeddings (Sampled 200k)<br>",
"<sup>Hover over points to see player names. Colors show player archetype clusters.<br><br>"
),
x = 0.5
),
scene = list(
xaxis = list(title = "UMAP1"),
yaxis = list(title = "UMAP2"),
zaxis = list(title = "UMAP3")
),
annotations = list(
list(
text = "Instructions: Rotate the plot with mouse drag. Zoom with scroll.",
xref = "paper",
yref = "paper",
x = 0,
y = -0.1,
showarrow = FALSE,
font = list(size = 12)
)
)
)
umap_3d_plot